In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

pd.options.mode.chained_assignment = None

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

import dalex as dx

import pickle
import ast

import warnings
warnings.filterwarnings('ignore')

from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler
In [2]:
clf_results_df = pd.read_csv('CV_results.csv', index_col = 0)
In [3]:
used_metrics = ['roc_auc', 'f1', 'accuracy']
In [4]:
clf_results_df.head()
Out[4]:
mean_fit_time std_fit_time mean_score_time std_score_time param_nrounds param_min_child_weight param_lambda param_eta param_colsample_bytree param_colsample_bylevel ... std_test_f1 rank_test_f1 split0_test_accuracy split1_test_accuracy split2_test_accuracy split3_test_accuracy split4_test_accuracy mean_test_accuracy std_test_accuracy rank_test_accuracy
0 14.686826 1.362751 0.029591 0.000557 2222.0 5.039684 1024.000000 0.214311 0.666667 1.000000 ... 0.017854 12 0.899751 0.895225 0.900860 0.900634 0.903576 0.900009 0.002713 7
1 6.655648 0.159322 0.028604 0.003704 3889.0 43.545280 0.462937 0.099213 0.666667 0.666667 ... 0.033574 28 0.887531 0.885721 0.884563 0.888185 0.888637 0.886927 0.001544 29
2 2.165086 0.115962 0.029119 0.000515 2222.0 5.039684 0.099213 0.004557 0.222222 0.222222 ... 0.005378 32 0.888210 0.886400 0.886600 0.887053 0.886148 0.886882 0.000727 30
3 3.120148 0.107695 0.034262 0.009469 1111.0 14.813995 0.000977 0.002109 0.444444 0.222222 ... 0.009058 29 0.888210 0.887531 0.886374 0.886600 0.887053 0.887154 0.000661 28
4 5.744213 0.020123 0.031193 0.003130 1.0 5.039684 0.462937 0.002109 1.000000 0.222222 ... 0.018855 22 0.888889 0.889115 0.889316 0.889995 0.892938 0.890051 0.001490 19

5 rows × 36 columns

**TODO:** add an interpretation of the cross-validation results shown above.

In [5]:
from IPython.display import Image
Image(filename='models.png')
Out[5]:
In [6]:
clf_results_df['clusters'] = np.load('clusters.npy')
In [7]:
pd.set_option('display.max_columns', None)
In [8]:
import re
In [9]:
# Collect the hyper-parameter columns and the mean-test-metric columns
# by regex-matching against the joined column names.
clf_results_df_str = ' '.join(clf_results_df.columns)
params = re.findall(r"param_[a-z]*_?[a-z]*_?[a-z]*", clf_results_df_str)
mean_test = re.findall(r"mean_test_[a-z1]*_?[a-z]*_?[a-z]*", clf_results_df_str)
params_mean_test = params + mean_test
params_mean_test
Out[9]:
['param_nrounds',
 'param_min_child_weight',
 'param_lambda',
 'param_eta',
 'param_colsample_bytree',
 'param_colsample_bylevel',
 'param_alpha',
 'mean_test_roc_auc',
 'mean_test_f1',
 'mean_test_accuracy']

Średnie wartości hiperparametrów i wybranych metryk w zależności od przynależności modelu do danego klastra

In [10]:
clf_results_df.groupby('clusters').mean()[params_mean_test]
Out[10]:
param_nrounds param_min_child_weight param_lambda param_eta param_colsample_bytree param_colsample_bylevel param_alpha mean_test_roc_auc mean_test_f1 mean_test_accuracy
clusters
0 2287.647059 30.783062 6.364461 0.120043 0.699346 0.509804 105.248315 0.868800 0.235583 0.890602
1 2222.500000 20.500949 1024.000000 0.155843 0.777778 0.462963 81.084522 0.848205 0.173288 0.888648
2 2296.266667 14.192157 7.944270 0.141582 0.170370 0.503704 156.665655 0.855535 0.162305 0.886939
3 2530.777778 20.150052 0.308031 0.389076 0.950617 0.728395 35.098456 0.866610 0.343347 0.893345
4 1852.000000 47.937998 1024.000000 0.048706 0.148148 0.555556 31.387407 0.805591 0.121631 0.885072

Wybieramy najlepsze modele z każdego z otrzymanych klastrów

In [11]:
def find_indexes(X, k):
    """Return the index labels of rows whose `rank_test_roc_auc` is in `k`.

    Parameters
    ----------
    X : pd.DataFrame
        CV-results frame with a 'rank_test_roc_auc' column.
    k : iterable
        Rank values to select (e.g. [1, 2, 3]).

    Returns
    -------
    list
        Index labels (not positions) of the matching rows.
    """
    rank_mask = X['rank_test_roc_auc'].isin(k)
    return X.index[rank_mask].tolist()
In [12]:
indexes_of_best =  find_indexes(clf_results_df, [1,2,3,5,21])
In [13]:
# find_indexes returns index *labels*, so select with .loc; the original
# positional .iloc only worked because the CSV index happens to be 0..n-1.
best_params = clf_results_df.loc[indexes_of_best, 'params']
In [14]:
best_params.reset_index(drop = True, inplace = True)
In [15]:
for params in best_params:
    print(params, "\n")
{'nrounds': 1.0, 'min_child_weight': 14.813995396596646, 'lambda': 1024.0, 'eta': 0.46293735614364534, 'colsample_bytree': 0.6666666666666666, 'colsample_bylevel': 0.2222222222222222, 'alpha': 0.004556754060844206} 

{'nrounds': 556.0, 'min_child_weight': 74.65785853287147, 'lambda': 0.09921256574801249, 'eta': 0.09921256574801249, 'colsample_bytree': 0.5555555555555556, 'colsample_bylevel': 0.6666666666666666, 'alpha': 0.46293735614364534} 

{'nrounds': 1667.0, 'min_child_weight': 14.813995396596646, 'lambda': 1024.0, 'eta': 0.09921256574801249, 'colsample_bytree': 0.2222222222222222, 'colsample_bylevel': 0.7777777777777777, 'alpha': 47.03150375281921} 

{'nrounds': 1667.0, 'min_child_weight': 2.9394689845511977, 'lambda': 2.1601194777846118, 'eta': 0.09921256574801249, 'colsample_bytree': 0.2222222222222222, 'colsample_bylevel': 0.6666666666666666, 'alpha': 0.004556754060844206} 

{'nrounds': 2222.0, 'min_child_weight': 14.813995396596646, 'lambda': 0.46293735614364534, 'eta': 0.46293735614364534, 'colsample_bytree': 0.8888888888888888, 'colsample_bylevel': 0.8888888888888888, 'alpha': 47.03150375281921} 

Otrzymaliśmy zatem 5 modeli. Czas je stworzyć i nauczyć.

Najpierw musimy jednak wczytać dane

In [16]:
def get_task(path, task_name='mort'):
    """Load the pickled multi-task label list and extract one task's targets.

    Each element of the pickled object is an indexable record holding the
    labels for the four tasks (mortality, readmission, length-of-stay,
    diagnosis); this returns the column for the requested task.

    Parameters
    ----------
    path : str
        Path to the pickle file of per-patient label records.
    task_name : str, optional
        One of 'mort', 'readmit', 'los', 'dx'. Defaults to 'mort',
        preserving the original behavior.

    Returns
    -------
    np.ndarray
        1-D array of labels for the chosen task.
    """
    task_index = {'mort': 0, 'readmit': 1, 'los': 2, 'dx': 3}
    # SECURITY NOTE: pickle.load executes arbitrary code from the file;
    # only use on trusted local data.
    with open(path, 'rb') as f:
        labels = pickle.load(f)
    return np.array([yy[task_index[task_name]] for yy in labels])
In [17]:
#Loading data
X = np.load("./local_mimic/save/X48.npy")
Z = np.load("./local_mimic/save/w2v.npy")
y = get_task("./local_mimic/save/y")
In [18]:
#Data transformations
X, Z, y = np.array(X), np.array(Z), np.array(y)
X = np.append(X, Z, axis=1)
In [19]:
X = pd.DataFrame(X)
In [20]:
### ast.literal_eval — safely parse each params string into a dict
### ** unpacks that dict so XGBClassifier receives the tuned hyper-parameters

xgb_1 = XGBClassifier(**ast.literal_eval(best_params[0]))
xgb_2 = XGBClassifier(**ast.literal_eval(best_params[1]))
xgb_3 = XGBClassifier(**ast.literal_eval(best_params[2]))
xgb_4 = XGBClassifier(**ast.literal_eval(best_params[3]))
xgb_5 = XGBClassifier(**ast.literal_eval(best_params[4]))

# sanity check: xgboost accepted the parameters it should have —
# the printed dict should match the hyper-parameters in the repr of xgb_5
print(best_params[4])
xgb_5
{'nrounds': 2222.0, 'min_child_weight': 14.813995396596646, 'lambda': 0.46293735614364534, 'eta': 0.46293735614364534, 'colsample_bytree': 0.8888888888888888, 'colsample_bylevel': 0.8888888888888888, 'alpha': 47.03150375281921}
Out[20]:
XGBClassifier(alpha=47.03150375281921, base_score=None, booster=None,
              colsample_bylevel=0.8888888888888888, colsample_bynode=None,
              colsample_bytree=0.8888888888888888, eta=0.46293735614364534,
              gamma=None, gpu_id=None, importance_type='gain',
              interaction_constraints=None, lambda=0.46293735614364534,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=14.813995396596646, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              nrounds=2222.0, num_parallel_tree=None, random_state=None,
              reg_alpha=None, reg_lambda=None, scale_pos_weight=None,
              subsample=None, tree_method=None, validate_parameters=None,
              verbosity=None)
In [21]:
models = [xgb_1, xgb_2, xgb_3, xgb_4, xgb_5]
In [22]:
for model in models:
    model.fit(X,y)
[19:40:23] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:541: 
Parameters: { nrounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[19:40:24] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[19:40:42] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:541: 
Parameters: { nrounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[19:40:43] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[19:41:17] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:541: 
Parameters: { nrounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[19:41:17] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[19:41:37] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:541: 
Parameters: { nrounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[19:41:37] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
[19:41:55] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:541: 
Parameters: { nrounds } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[19:41:57] WARNING: /Users/travis/build/dmlc/xgboost/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
In [23]:
# Build one dalex explainer per fitted model, labelled "XGB 1" … "XGB 5".
xgb_1_exp, xgb_2_exp, xgb_3_exp, xgb_4_exp, xgb_5_exp = [
    dx.Explainer(model, X, y, label="XGB " + str(i))
    for i, model in enumerate(models, start=1)
]
Preparation of a new explainer is initiated

  -> data              : 27616 rows 276 cols
  -> target variable   : 27616 values
  -> model_class       : xgboost.sklearn.XGBClassifier (default)
  -> label             : XGB 1
  -> predict function  : <function yhat_proba_default at 0x7f8dfa055f80> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.000687, mean = 0.118, max = 0.978
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.936, mean = 0.000853, max = 0.997
  -> model_info        : package xgboost

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 27616 rows 276 cols
  -> target variable   : 27616 values
  -> model_class       : xgboost.sklearn.XGBClassifier (default)
  -> label             : XGB 2
  -> predict function  : <function yhat_proba_default at 0x7f8dfa055f80> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.000951, mean = 0.119, max = 0.981
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.895, mean = -0.000123, max = 0.989
  -> model_info        : package xgboost

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 27616 rows 276 cols
  -> target variable   : 27616 values
  -> model_class       : xgboost.sklearn.XGBClassifier (default)
  -> label             : XGB 3
  -> predict function  : <function yhat_proba_default at 0x7f8dfa055f80> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.0166, mean = 0.122, max = 0.777
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.747, mean = -0.00262, max = 0.98
  -> model_info        : package xgboost

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 27616 rows 276 cols
  -> target variable   : 27616 values
  -> model_class       : xgboost.sklearn.XGBClassifier (default)
  -> label             : XGB 4
  -> predict function  : <function yhat_proba_default at 0x7f8dfa055f80> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.000481, mean = 0.119, max = 0.99
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.9, mean = 3.47e-05, max = 0.985
  -> model_info        : package xgboost

A new explainer has been created!
Preparation of a new explainer is initiated

  -> data              : 27616 rows 276 cols
  -> target variable   : 27616 values
  -> model_class       : xgboost.sklearn.XGBClassifier (default)
  -> label             : XGB 5
  -> predict function  : <function yhat_proba_default at 0x7f8dfa055f80> will be used (default)
  -> predict function  : Accepts pandas.DataFrame and numpy.ndarray.
  -> predicted values  : min = 0.000312, mean = 0.119, max = 0.995
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.976, mean = -0.00017, max = 0.994
  -> model_info        : package xgboost

A new explainer has been created!

Sprawdzamy istotność zmiennych

In [24]:
vi_xgb_1 = xgb_1_exp.model_parts()
vi_xgb_2 = xgb_2_exp.model_parts()
vi_xgb_3 = xgb_3_exp.model_parts()
vi_xgb_4 = xgb_4_exp.model_parts()
vi_xgb_5 = xgb_5_exp.model_parts()
In [25]:
vi_xgb_1.plot([vi_xgb_2, vi_xgb_3, vi_xgb_4, vi_xgb_5])
In [26]:
def get_k_best_variables(vi_model, k):
    """Return the variables with the highest permutation dropout loss.

    Sorts the variable-importance result descending by 'dropout_loss' and
    takes positional rows 1..k-1 — the very first row is deliberately
    skipped (presumably the full-model baseline row; confirm against the
    dalex output), so at most k-1 variable names are returned.
    """
    ranked = vi_model.result.sort_values(by='dropout_loss', ascending=False)
    return set(ranked.iloc[1:k].variable)
In [27]:
# Variables that rank among the top-17 most important for *every* model.
important_variables = (get_k_best_variables(vi_xgb_1, 17)
                       & get_k_best_variables(vi_xgb_2, 17)
                       & get_k_best_variables(vi_xgb_3, 17)
                       & get_k_best_variables(vi_xgb_4, 17)
                       & get_k_best_variables(vi_xgb_5, 17))
In [28]:
important_variables
Out[28]:
{48, 73, 253, 257}

Partial-dependence (PD) plots

In [29]:
pd_xgb_1 = xgb_1_exp.model_profile(variables = list(important_variables))
pd_xgb_2 = xgb_2_exp.model_profile(variables = list(important_variables))
pd_xgb_3 = xgb_3_exp.model_profile(variables = list(important_variables))
pd_xgb_4 = xgb_4_exp.model_profile(variables = list(important_variables))
pd_xgb_5 = xgb_5_exp.model_profile(variables = list(important_variables))
Calculating ceteris paribus: 100%|██████████| 4/4 [00:02<00:00,  1.59it/s]
Calculating ceteris paribus: 100%|██████████| 4/4 [00:02<00:00,  1.82it/s]
Calculating ceteris paribus: 100%|██████████| 4/4 [00:02<00:00,  1.68it/s]
Calculating ceteris paribus: 100%|██████████| 4/4 [00:02<00:00,  1.96it/s]
Calculating ceteris paribus: 100%|██████████| 4/4 [00:02<00:00,  1.95it/s]
In [30]:
pd_xgb_1.plot([pd_xgb_2, pd_xgb_3, pd_xgb_4, pd_xgb_5])

Accumulated-local (ALE) Profiles

In [31]:
al_xgb_1 = xgb_1_exp.model_profile(variables = list(important_variables), type = 'accumulated')
al_xgb_2 = xgb_2_exp.model_profile(variables = list(important_variables), type = 'accumulated')
al_xgb_3 = xgb_3_exp.model_profile(variables = list(important_variables), type = 'accumulated')
al_xgb_4 = xgb_4_exp.model_profile(variables = list(important_variables), type = 'accumulated')
al_xgb_5 = xgb_5_exp.model_profile(variables = list(important_variables), type = 'accumulated')
Calculating ceteris paribus: 100%|██████████| 4/4 [00:02<00:00,  1.40it/s]
Calculating accumulated dependency: 100%|██████████| 4/4 [00:02<00:00,  1.96it/s]
Calculating ceteris paribus: 100%|██████████| 4/4 [00:01<00:00,  2.43it/s]
Calculating accumulated dependency: 100%|██████████| 4/4 [00:01<00:00,  2.73it/s]
Calculating ceteris paribus: 100%|██████████| 4/4 [00:01<00:00,  2.78it/s]
Calculating accumulated dependency: 100%|██████████| 4/4 [00:02<00:00,  1.98it/s]
Calculating ceteris paribus: 100%|██████████| 4/4 [00:01<00:00,  2.34it/s]
Calculating accumulated dependency: 100%|██████████| 4/4 [00:01<00:00,  2.52it/s]
Calculating ceteris paribus: 100%|██████████| 4/4 [00:01<00:00,  2.15it/s]
Calculating accumulated dependency: 100%|██████████| 4/4 [00:02<00:00,  1.99it/s]
In [32]:
al_xgb_1.plot([al_xgb_2, al_xgb_3, al_xgb_4, al_xgb_5])
In [33]:
al_models = [al_xgb_1, al_xgb_2, al_xgb_3, al_xgb_4, al_xgb_5]
In [34]:
pd_models = [pd_xgb_1, pd_xgb_2, pd_xgb_3, pd_xgb_4, pd_xgb_5]
In [35]:
# Tag the ALE results' labels so the combined plots below distinguish
# ALE curves from the PD curves of the same model.
for profile in al_models:
    profile.result['_label_'] = profile.result['_label_'] + ' AL profiles'
In [36]:
for i in range(5):
    al_models[i].plot(pd_models[i])

Wykresy zależności miary jakości modelu w zależności od zmienianych hiperparametrów

Update - dodaliśmy skalę logarytmiczną

In [37]:
params = ['nrounds', 'min_child_weight', 'lambda', 'eta', 'colsample_bytree', 'colsample_bylevel', 'alpha']
In [38]:
logs = [False, True, True, True, False, False, True]
In [39]:
def plot_one_param(df, param, metrics, log):
    """Plot mean test score against one hyper-parameter for three metrics.

    Parameters
    ----------
    df : pd.DataFrame
        CV-results frame with 'param_<name>' and 'mean_test_<metric>' columns.
    param : str
        Hyper-parameter name (without the 'param_' prefix).
    metrics : sequence of str
        Exactly three metric names, one subplot each.
    log : bool
        If True, use a logarithmic x-axis on every subplot.
    """
    col = 'param_' + param
    # Hoisted out of the loop: the grouped mean is identical for all metrics.
    grouped = df.groupby(col).mean()
    fig, axs = plt.subplots(1, 3, sharey=True, figsize=(8, 4))
    for i in range(3):
        sns.lineplot(ax=axs[i], data=grouped, x=col,
                     y='mean_test_' + metrics[i])
        axs[i].set_ylabel('mean test score', fontsize=14)
        axs[i].set_xlabel("")
        xlabel = col
        if log:
            axs[i].set(xscale="log")
            xlabel = "log(" + xlabel + ")"
        # only the middle subplot carries the shared x-axis label
        if i == 1:
            axs[i].set_xlabel(xlabel, fontsize=12)
        axs[i].set_title(metrics[i])
In [40]:
for param, log in zip(params, logs):
    plot_one_param(clf_results_df, param, used_metrics, log)

Requirements:

  • xgboost==1.3.3
  • seaborn==0.11.1
  • scikit-learn==0.24.1
  • pandas==1.2.3
  • numpy==1.20.1
  • matplotlib==3.3.4
  • dalex==1.0.1